{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true,
    "pycharm": {
     "is_executing": false
    }
   },
   "outputs": [],
   "source": [
    "import pathlib\n",
    "\n",
    "import numpy as np\n",
    "import scipy.sparse\n",
    "import scipy.io\n",
    "import pandas as pd\n",
    "import pickle\n",
    "from sklearn.feature_extraction.text import CountVectorizer\n",
    "import networkx as nx\n",
    "import utils.preprocess\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS as sklearn_stopwords\n",
    "from nltk import word_tokenize\n",
    "from nltk.corpus import stopwords as nltk_stopwords\n",
    "from nltk.stem import WordNetLemmatizer\n",
    "\n",
    "from utils.data import load_glove_vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "outputs": [],
   "source": [
    "save_prefix = 'data/preprocessed/DBLP_processed/'\n",
    "num_ntypes = 4"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "Loading GloVe pretrained word vectors\n",
      "Done. 400000 words loaded!\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "author_label = pd.read_csv('data/raw/DBLP/author_label.txt', sep='\\t', header=None, names=['author_id', 'label', 'author_name'], keep_default_na=False, encoding='utf-8')\n",
    "paper_author = pd.read_csv('data/raw/DBLP/paper_author.txt', sep='\\t', header=None, names=['paper_id', 'author_id'], keep_default_na=False, encoding='utf-8')\n",
    "paper_conf = pd.read_csv('data/raw/DBLP/paper_conf.txt', sep='\\t', header=None, names=['paper_id', 'conf_id'], keep_default_na=False, encoding='utf-8')\n",
    "paper_term = pd.read_csv('data/raw/DBLP/paper_term.txt', sep='\\t', header=None, names=['paper_id', 'term_id'], keep_default_na=False, encoding='utf-8')\n",
    "papers = pd.read_csv('data/raw/DBLP/paper.txt', sep='\\t', header=None, names=['paper_id', 'paper_title'], keep_default_na=False, encoding='cp1252')\n",
    "terms = pd.read_csv('data/raw/DBLP/term.txt', sep='\\t', header=None, names=['term_id', 'term'], keep_default_na=False, encoding='utf-8')\n",
    "confs = pd.read_csv('data/raw/DBLP/conf.txt', sep='\\t', header=None, names=['conf_id', 'conf'], keep_default_na=False, encoding='utf-8')\n",
    "\n",
    "glove_dim = 50\n",
    "glove_vectors = load_glove_vectors(dim=glove_dim)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "outputs": [],
   "source": [
    "# filter out all nodes which does not associated with labeled authors\n",
    "labeled_authors = author_label['author_id'].to_list()\n",
    "paper_author = paper_author[paper_author['author_id'].isin(labeled_authors)].reset_index(drop=True)\n",
    "valid_papers = paper_author['paper_id'].unique()\n",
    "papers = papers[papers['paper_id'].isin(valid_papers)].reset_index(drop=True)\n",
    "paper_conf = paper_conf[paper_conf['paper_id'].isin(valid_papers)].reset_index(drop=True)\n",
    "paper_term = paper_term[paper_term['paper_id'].isin(valid_papers)].reset_index(drop=True)\n",
    "valid_terms = paper_term['term_id'].unique()\n",
    "terms = terms[terms['term_id'].isin(valid_terms)].reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "outputs": [],
   "source": [
    "# term lemmatization and grouping\n",
    "lemmatizer = WordNetLemmatizer()\n",
    "lemma_id_mapping = {}\n",
    "lemma_list = []\n",
    "lemma_id_list = []\n",
    "i = 0\n",
    "for _, row in terms.iterrows():\n",
    "    i += 1\n",
    "    lemma = lemmatizer.lemmatize(row['term'])\n",
    "    lemma_list.append(lemma)\n",
    "    if lemma not in lemma_id_mapping:\n",
    "        lemma_id_mapping[lemma] = row['term_id']\n",
    "    lemma_id_list.append(lemma_id_mapping[lemma])\n",
    "terms['lemma'] = lemma_list\n",
    "terms['lemma_id'] = lemma_id_list\n",
    "\n",
    "term_lemma_mapping = {row['term_id']: row['lemma_id'] for _, row in terms.iterrows()}\n",
    "lemma_id_list = []\n",
    "for _, row in paper_term.iterrows():\n",
    "    lemma_id_list.append(term_lemma_mapping[row['term_id']])\n",
    "paper_term['lemma_id'] = lemma_id_list\n",
    "\n",
    "paper_term = paper_term[['paper_id', 'lemma_id']]\n",
    "paper_term.columns = ['paper_id', 'term_id']\n",
    "paper_term = paper_term.drop_duplicates()\n",
    "terms = terms[['lemma_id', 'lemma']]\n",
    "terms.columns = ['term_id', 'term']\n",
    "terms = terms.drop_duplicates()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "outputs": [],
   "source": [
    "# filter out stopwords from terms\n",
    "stopwords = sklearn_stopwords.union(set(nltk_stopwords.words('english')))\n",
    "stopword_id_list = terms[terms['term'].isin(stopwords)]['term_id'].to_list()\n",
    "paper_term = paper_term[~(paper_term['term_id'].isin(stopword_id_list))].reset_index(drop=True)\n",
    "terms = terms[~(terms['term'].isin(stopwords))].reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "outputs": [],
   "source": [
    "# remove terms not found in GloVe\n",
    "#terms_not_in_glove = []\n",
    "#for _, row in terms.iterrows():\n",
    "#    if row['term'] not in glove_vectors:\n",
    "#        terms_not_in_glove.append(row['term'])\n",
    "#term_ids_not_in_glove = terms[terms['term'].isin(terms_not_in_glove)]['term_id'].to_list()\n",
    "#terms = terms[~(terms['term'].isin(terms_not_in_glove))].reset_index(drop=True)\n",
    "#paper_term = paper_term[~(paper_term['term_id'].isin(term_ids_not_in_glove))].reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "outputs": [],
   "source": [
    "# consider only terms associated with at least two papers\n",
    "# if having meaningful word vectors for terms (e.g. GloVe)?"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "outputs": [],
   "source": [
    "author_label = author_label.sort_values('author_id').reset_index(drop=True)\n",
    "papers = papers.sort_values('paper_id').reset_index(drop=True)\n",
    "terms = terms.sort_values('term_id').reset_index(drop=True)\n",
    "confs = confs.sort_values('conf_id').reset_index(drop=True)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "outputs": [],
   "source": [
    "# extract labels of authors\n",
    "labels = author_label['label'].to_numpy()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "outputs": [],
   "source": [
    "# build the adjacency matrix for the graph consisting of authors, papers, terms and conferences\n",
    "# 0 for authors, 1 for papers, 2 for terms, 3 for conferences\n",
    "dim = len(author_label) + len(papers) + len(terms) + len(confs)\n",
    "type_mask = np.zeros((dim), dtype=int)\n",
    "type_mask[len(author_label):len(author_label)+len(papers)] = 1\n",
    "type_mask[len(author_label)+len(papers):len(author_label)+len(papers)+len(terms)] = 2\n",
    "type_mask[len(author_label)+len(papers)+len(terms):] = 3\n",
    "\n",
    "author_id_mapping = {row['author_id']: i for i, row in author_label.iterrows()}\n",
    "paper_id_mapping = {row['paper_id']: i + len(author_label) for i, row in papers.iterrows()}\n",
    "term_id_mapping = {row['term_id']: i + len(author_label) + len(papers) for i, row in terms.iterrows()}\n",
    "conf_id_mapping = {row['conf_id']: i + len(author_label) + len(papers) + len(terms) for i, row in confs.iterrows()}\n",
    "\n",
    "adjM = np.zeros((dim, dim), dtype=int)\n",
    "for _, row in paper_author.iterrows():\n",
    "    idx1 = paper_id_mapping[row['paper_id']]\n",
    "    idx2 = author_id_mapping[row['author_id']]\n",
    "    adjM[idx1, idx2] = 1\n",
    "    adjM[idx2, idx1] = 1\n",
    "for _, row in paper_term.iterrows():\n",
    "    idx1 = paper_id_mapping[row['paper_id']]\n",
    "    idx2 = term_id_mapping[row['term_id']]\n",
    "    adjM[idx1, idx2] = 1\n",
    "    adjM[idx2, idx1] = 1\n",
    "for _, row in paper_conf.iterrows():\n",
    "    idx1 = paper_id_mapping[row['paper_id']]\n",
    "    idx2 = conf_id_mapping[row['conf_id']]\n",
    "    adjM[idx1, idx2] = 1\n",
    "    adjM[idx2, idx1] = 1"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "outputs": [],
   "source": [
    "# use HAN paper's preprocessed data as the features of authors (https://github.com/Jhy1993/HAN)\n",
    "mat = scipy.io.loadmat('data/raw/DBLP/DBLP4057_GAT_with_idx.mat')\n",
    "features_author = np.array(list(zip(*sorted(zip(labeled_authors, mat['features']), key=lambda tup: tup[0])))[1])\n",
    "features_author = scipy.sparse.csr_matrix(features_author)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "outputs": [
    {
     "name": "stderr",
     "text": [
      "C:\\Users\\xyfu\\Anaconda3\\envs\\dgl-gpu\\lib\\site-packages\\sklearn\\feature_extraction\\text.py:300: UserWarning: Your stop_words may be inconsistent with your preprocessing. Tokenizing the stop words generated tokens [\"'d\", \"'ll\", \"'re\", \"'s\", \"'ve\", 'doe', 'ha', 'le', \"n't\", 'need', 'sha', 'u', 'wa', 'wo'] not in stop_words.\n  'stop_words.' % sorted(inconsistent))\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "# use bag-of-words representation of paper titles as the features of papers\n",
    "class LemmaTokenizer:\n",
    "    def __init__(self):\n",
    "        self.wnl = WordNetLemmatizer()\n",
    "    def __call__(self, doc):\n",
    "        return [self.wnl.lemmatize(t) for t in word_tokenize(doc)]\n",
    "vectorizer = CountVectorizer(min_df=2, stop_words=stopwords, tokenizer=LemmaTokenizer())\n",
    "features_paper = vectorizer.fit_transform(papers['paper_title'].values)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "outputs": [],
   "source": [
    "# use pretrained GloVe vectors as the features of terms\n",
    "features_term = np.zeros((len(terms), glove_dim))\n",
    "for i, row in terms.iterrows():\n",
    "    features_term[i] = glove_vectors.get(row['term'], glove_vectors['the'])"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "outputs": [
    {
     "name": "stdout",
     "text": [
      "(32789, 3)\n",
      "(41633537, 5)\n",
      "(30803571, 5)\n"
     ],
     "output_type": "stream"
    }
   ],
   "source": [
    "expected_metapaths = [\n",
    "    [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)],\n",
    "    [(1, 0, 1), (1, 2, 1), (1, 3, 1)],\n",
    "    [(2, 1, 2), (2, 1, 0, 1, 2), (2, 1, 3, 1, 2)],\n",
    "    [(3, 1, 3), (3, 1, 0, 1, 3), (3, 1, 2, 1, 3)]\n",
    "]\n",
    "# create the directories if they do not exist\n",
    "for i in range(1):\n",
    "    pathlib.Path(save_prefix + '{}'.format(i)).mkdir(parents=True, exist_ok=True)\n",
    "for i in range(1):\n",
    "    # get metapath based neighbor pairs\n",
    "    neighbor_pairs = utils.preprocess.get_metapath_neighbor_pairs(adjM, type_mask, expected_metapaths[i])\n",
    "    # construct and save metapath-based networks\n",
    "    G_list = utils.preprocess.get_networkx_graph(neighbor_pairs, type_mask, i)\n",
    "    \n",
    "    # save data\n",
    "    # networkx graph (metapath specific)\n",
    "    for G, metapath in zip(G_list, expected_metapaths[i]):\n",
    "        nx.write_adjlist(G, save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '.adjlist')\n",
    "    # node indices of edge metapaths\n",
    "    all_edge_metapath_idx_array = utils.preprocess.get_edge_metapath_idx_array(neighbor_pairs)\n",
    "    for metapath, edge_metapath_idx_array in zip(expected_metapaths[i], all_edge_metapath_idx_array):\n",
    "        np.save(save_prefix + '{}/'.format(i) + '-'.join(map(str, metapath)) + '_idx.npy', edge_metapath_idx_array)\n",
    "# save data\n",
    "# all nodes adjacency matrix\n",
    "scipy.sparse.save_npz(save_prefix + 'adjM.npz', scipy.sparse.csr_matrix(adjM))\n",
    "# all nodes (authors, papers, terms and conferences) features\n",
    "# currently only have features of authors, papers and terms\n",
    "scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(0), features_author)\n",
    "scipy.sparse.save_npz(save_prefix + 'features_{}.npz'.format(1), features_paper)\n",
    "np.save(save_prefix + 'features_{}.npy'.format(2), features_term)\n",
    "# all nodes (authors, papers, terms and conferences) type labels\n",
    "np.save(save_prefix + 'node_types.npy', type_mask)\n",
    "# author labels\n",
    "np.save(save_prefix + 'labels.npy', labels)\n",
    "# author train/validation/test splits\n",
    "rand_seed = 1566911444\n",
    "train_idx, val_idx = train_test_split(np.arange(len(labels)), test_size=400, random_state=rand_seed)\n",
    "train_idx, test_idx = train_test_split(train_idx, test_size=3257, random_state=rand_seed)\n",
    "train_idx.sort()\n",
    "val_idx.sort()\n",
    "test_idx.sort()\n",
    "np.savez(save_prefix + 'train_val_test_idx.npz',\n",
    "         val_idx=val_idx,\n",
    "         train_idx=train_idx,\n",
    "         test_idx=test_idx)"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": true
    }
   }
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "outputs": [],
   "source": [
    "# post-processing for mini-batched training\n",
    "target_idx_list = np.arange(4057)\n",
    "for metapath in [(0, 1, 0), (0, 1, 2, 1, 0), (0, 1, 3, 1, 0)]:\n",
    "    edge_metapath_idx_array = np.load(save_prefix + '{}/'.format(0) + '-'.join(map(str, metapath)) + '_idx.npy')\n",
    "    target_metapaths_mapping = {}\n",
    "    for target_idx in target_idx_list:\n",
    "        target_metapaths_mapping[target_idx] = edge_metapath_idx_array[edge_metapath_idx_array[:, 0] == target_idx][:, ::-1]\n",
    "    out_file = open(save_prefix + '{}/'.format(0) + '-'.join(map(str, metapath)) + '_idx.pickle', 'wb')\n",
    "    pickle.dump(target_metapaths_mapping, out_file)\n",
    "    out_file.close()"
   ],
   "metadata": {
    "collapsed": false,
    "pycharm": {
     "name": "#%%\n",
     "is_executing": false
    }
   }
  }
 ],
 "metadata": {
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.6"
  },
  "kernelspec": {
   "name": "python3",
   "language": "python",
   "display_name": "Python 3"
  },
  "pycharm": {
   "stem_cell": {
    "cell_type": "raw",
    "source": [],
    "metadata": {
     "collapsed": false
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}